import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
# Pandas display options. FIX: the bare keys ('max_colwidth', 'max_rows',
# 'max_columns') were deprecated and later removed; modern pandas requires
# the fully qualified 'display.' option names.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 1000)
import seaborn as sns
# Use seaborn's dark-grid theme for every plot in this notebook.
sns.set_style('darkgrid')
#data = pd.read_csv("../input/loan-prediction-analytics-vidhya/train_ctrUa4K.csv")
#test = pd.read_csv("../input/loan-prediction-analytics-vidhya/test_lAUu6dG.csv")
# Load the loan-prediction train/test splits from the working directory
# (the commented paths above are the original Kaggle dataset locations).
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Target distribution: approvals (Y) outnumber rejections (N) roughly 2:1.
sns.countplot(x="Loan_Status", data=data)
<AxesSubplot:xlabel='Loan_Status', ylabel='count'>
Numeric Columns
Categorical Columns
# Partition the training frame into numeric and categorical views.
num_cols = data.select_dtypes(exclude='object').copy()
cat_cols = data.select_dtypes(include='object').copy()
# Loan_ID is an identifier, not a predictive category — exclude it.
cat_cols = cat_cols.drop(columns='Loan_ID')
num_cols.head()
| ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | |
|---|---|---|---|---|---|
| 0 | 5849 | 0.0 | NaN | 360.0 | 1.0 |
| 1 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 |
| 2 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 |
| 3 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 |
| 4 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 |
# Preview the categorical features.
cat_cols.head()
| Gender | Married | Dependents | Education | Self_Employed | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|
| 0 | Male | No | 0 | Graduate | No | Urban | Y |
| 1 | Male | Yes | 1 | Graduate | No | Rural | N |
| 2 | Male | Yes | 0 | Graduate | Yes | Urban | Y |
| 3 | Male | Yes | 0 | Not Graduate | No | Urban | Y |
| 4 | Male | No | 0 | Graduate | No | Urban | Y |
# Histogram of each numeric feature.
# FIX: sns.distplot is deprecated (removed in seaborn 0.14); histplot(..., bins=30)
# is the modern equivalent of distplot(kde=False).
fig = plt.figure(figsize=(12, 16))
for index, col in enumerate(num_cols):
    plt.subplot(3, 2, index + 1)
    sns.histplot(num_cols.loc[:, col].dropna(), bins=30)
fig.tight_layout(pad=1.0)
# The loan term is dominated by a single value (360 months) — see output below.
data['Loan_Amount_Term'].value_counts()
360.0 512 180.0 44 480.0 15 300.0 13 84.0 4 240.0 4 120.0 3 36.0 2 60.0 2 12.0 1 Name: Loan_Amount_Term, dtype: int64
# Boxplot of each numeric feature to eyeball outliers.
fig = plt.figure(figsize=(12, 16))
complete_rows = num_cols.dropna()
for position, column in enumerate(num_cols.columns, start=1):
    plt.subplot(3, 2, position)
    sns.boxplot(data=complete_rows, y=column)
fig.tight_layout(pad=1.0)
# Count plot of each categorical feature.
fig = plt.figure(figsize=(18, 20))
for position, column in enumerate(cat_cols.columns, start=1):
    plt.subplot(2, 4, position)
    sns.countplot(x=cat_cols[column], data=cat_cols.dropna())
fig.tight_layout(pad=1.0)
# Checking Null values
# Heatmap of data.isnull(): bright cells mark missing entries per column/row.
plt.figure(figsize=(10,8))
sns.heatmap(data.isnull(),cmap='plasma')
def percent_missing(d):
    """Return the ten columns of ``d`` with the highest percentage of missing values."""
    ratios = 100 * d.isnull().sum() / len(d)
    table = pd.DataFrame(ratios, columns=['Missing_%'])
    return table.sort_values('Missing_%', ascending=False).head(10)
# Report the most-missing columns in the training data.
percent_missing(data)
| Missing_% | |
|---|---|
| Credit_History | 8.143322 |
| Self_Employed | 5.211726 |
| LoanAmount | 3.583062 |
| Dependents | 2.442997 |
| Loan_Amount_Term | 2.280130 |
| Gender | 2.117264 |
| Married | 0.488599 |
| Loan_ID | 0.000000 |
| Education | 0.000000 |
| ApplicantIncome | 0.000000 |
# Encode the target (Y -> 1, N -> 0) so it can enter the correlation matrix.
target_codes = {'Y': 1, 'N': 0}
data["Loan_Status"] = data["Loan_Status"].map(target_codes)
corr = data.corr()
sns.heatmap(data=corr.dropna(), cmap='Blues', linewidth=0.5)
<AxesSubplot:>
# Loan status split by credit history — per the correlation output below,
# Credit_History is by far the strongest single predictor (r ~ 0.56).
sns.countplot(x='Loan_Status',data=data,hue='Credit_History')
<AxesSubplot:xlabel='Loan_Status', ylabel='count'>
# Correlation of every numeric feature with the (now numeric) target.
corr["Loan_Status"]
ApplicantIncome -0.004710 CoapplicantIncome -0.059187 LoanAmount -0.037318 Loan_Amount_Term -0.021268 Credit_History 0.561678 Loan_Status 1.000000 Name: Loan_Status, dtype: float64
# Count plot of each categorical feature, split by loan outcome.
fig = plt.figure(figsize=(18, 20))
for position, column in enumerate(cat_cols.columns, start=1):
    plt.subplot(2, 4, position)
    sns.countplot(x=cat_cols[column], data=cat_cols.dropna(), hue=data['Loan_Status'])
fig.tight_layout(pad=1.0)
Outliers
# Drop extreme outliers from the training data.
# FIX: `data['LoanAmount'] < 500` evaluates to False for NaN, so the original
# line also silently discarded every row with a missing LoanAmount — defeating
# the income-group imputation performed further down. Keep the NaN rows here
# and let that fill step handle them.
data = data[data['ApplicantIncome'] < 50000]
data = data[(data['LoanAmount'] < 500) | data['LoanAmount'].isna()]
Mapping
# Integer-encode every categorical column; the identical encodings are
# applied to both the train and test frames.
encoders = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    # Dependents collapses to a binary has-dependents flag (1/2/3+ -> 1).
    'Dependents': {'0': 0, '1': 1, '2': 1, '3+': 1},
    'Education': {'Not Graduate': 0, 'Graduate': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Semiurban': 1, 'Urban': 2, 'Rural': 3},
}
for frame in (data, test):
    for column, encoding in encoders.items():
        frame[column] = frame[column].map(encoding)
# Binarise the loan term: 1 for the dominant 360-month term, 0 for all others.
term_codes = {term: 0 for term in
              (180.0, 480.0, 300.0, 84.0, 240.0, 120.0, 36.0, 60.0, 12.0)}
term_codes[360.0] = 1
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].map(term_codes)
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].map(term_codes)
# Re-check correlations after encoding; the mask hides every cell with r < 0.8,
# so only strong positive collinearity would remain visible.
corr = data.corr()
sns.heatmap(data=corr.dropna(),mask = corr < 0.8,cmap='Blues',linewidth=0.5)
<AxesSubplot:>
No strong collinearity between the other features
# Impute missing LoanAmount in the TEST frame with the mean LoanAmount of rows
# sharing the same ApplicantIncome (averages computed on the test frame itself).
loan_amt_avg = test.groupby('ApplicantIncome').mean()['LoanAmount']
def fill_loan_amt(LoanAmount,ApplicantIncome):
    # NOTE(review): reads the module-level `loan_amt_avg` current at call time;
    # it is re-bound to train-frame averages just below, so call order matters.
    if np.isnan(LoanAmount):
        return loan_amt_avg[ApplicantIncome]
    else:
        return LoanAmount
test['LoanAmount'] = test.apply(lambda x: fill_loan_amt(x['LoanAmount'], x['ApplicantIncome']), axis=1)
# Backstop: income groups whose mean is itself NaN fall back to a fixed overall mean.
test['LoanAmount'] = test['LoanAmount'].fillna(136.9586777)
# Same LoanAmount imputation for the TRAIN frame (re-binds `loan_amt_avg` and
# re-defines fill_loan_amt, shadowing the test-frame versions above).
# NOTE(review): the earlier outlier filter `data['LoanAmount'] < 500` already
# removes rows where LoanAmount is NaN, so this fill likely finds nothing to
# impute — confirm which behaviour was intended.
loan_amt_avg = data.groupby('ApplicantIncome').mean()['LoanAmount']
def fill_loan_amt(LoanAmount,ApplicantIncome):
    if np.isnan(LoanAmount):
        return loan_amt_avg[ApplicantIncome]
    else:
        return LoanAmount
data['LoanAmount'] = data.apply(lambda x: fill_loan_amt(x['LoanAmount'], x['ApplicantIncome']), axis=1)
# Backstop for income groups with an all-NaN mean.
data['LoanAmount'] = data['LoanAmount'].fillna(146.4121622)
# Missing Dependents default to 1 — the "has dependents" bucket of the 0/1
# encoding applied earlier.
test['Dependents'] = test['Dependents'].fillna(1)
data['Dependents'] = data['Dependents'].fillna(1)
def fill_gender(Gender, Married):
    """Impute a missing (NaN) Gender code from marital status.

    A missing gender becomes 1 when Married == 1, otherwise 0; a present
    gender code is returned unchanged.
    """
    if not np.isnan(Gender):
        return Gender
    return 1 if Married == 1 else 0
# Apply the gender imputation row-wise to both frames.
test['Gender'] = test.apply(lambda x: fill_gender(x['Gender'], x['Married']), axis=1)
data['Gender'] = data.apply(lambda x: fill_gender(x['Gender'], x['Married']), axis=1)
def fill_self_emp(Self_Employed, ApplicantIncome):
    """Impute a missing (NaN) Self_Employed code from applicant income.

    A missing value becomes 1 when ApplicantIncome >= 7380, otherwise 0;
    a present code is returned unchanged.
    """
    if not np.isnan(Self_Employed):
        return Self_Employed
    return 1 if ApplicantIncome >= 7380 else 0
# Apply the self-employment imputation row-wise to both frames.
test['Self_Employed'] = test.apply(lambda x: fill_self_emp(x['Self_Employed'], x['ApplicantIncome']), axis=1)
data['Self_Employed'] = data.apply(lambda x: fill_self_emp(x['Self_Employed'], x['ApplicantIncome']), axis=1)
# Missing loan terms fall into the non-360-month bucket (0) created earlier.
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(0)
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(0)
# TEST keeps its rows: sentinel-encode missing credit history (-999, which
# XGBoost is later told to treat as missing).
test['Credit_History'] = test['Credit_History'].fillna(-999)
# TRAIN simply drops any row that still contains a NaN.
data = data.dropna()
def fill_CoapplicantIncome(CoapplicantIncome):
    """Collapse co-applicant income to a flag: 1 when it is zero, else 0."""
    return 1 if CoapplicantIncome == 0 else 0
# Replace CoapplicantIncome with the no-co-applicant flag in both frames.
data['CoapplicantIncome'] = data.apply(lambda x: fill_CoapplicantIncome(x['CoapplicantIncome']), axis=1)
test['CoapplicantIncome'] = test.apply(lambda x: fill_CoapplicantIncome(x['CoapplicantIncome']), axis=1)
# Any remaining NaNs become the -999 sentinel that XGBoost will treat as missing.
data = data.fillna(-999)
test = test.fillna(-999)
# Decode the target back to its original Y/N labels for fitting and submission.
mapp = {1:'Y',0:'N'}
data["Loan_Status"] = data["Loan_Status"].map(mapp)
# Features exclude the identifier and the target.
X_train = data.drop(['Loan_ID','Loan_Status'],axis=1)
y_train = data['Loan_Status']
# Class balance after cleaning (used to justify scale_pos_weight below).
y_train.value_counts()
Y 368 N 166 Name: Loan_Status, dtype: int64
# Rough majority/minority class ratio — the ~2.28 used for scale_pos_weight below.
406/178
2.2808988764044944
import optuna
def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of an XGBClassifier.

    Samples one hyper-parameter set per trial, evaluates it with shuffled
    3-fold cross-validation on the module-level X_train / y_train, and
    returns the average fold accuracy (maximised by the study).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 6, 13),
        # suggest_uniform is deprecated; suggest_float is the supported API.
        'learning_rate': trial.suggest_float('learning_rate', 0.009, 0.10),
        'subsample': trial.suggest_float('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.50, 1),
        # FIX: was suggest_int('gamma', 0, 0.05) — an int suggestion with a
        # float upper bound, which pinned gamma to 0 in every trial (see the
        # study log). A float suggestion actually searches the [0, 0.05] range.
        'gamma': trial.suggest_float('gamma', 0.0, 0.05),
        'missing': -999,           # sentinel used during preprocessing
        'scale_pos_weight': 2.28,  # majority/minority class ratio
        'eval_metric': 'auc',
    }
    accuracies = []
    X = X_train.values
    y = y_train.values
    kf = KFold(n_splits=3, random_state=2000, shuffle=True)
    for train_idx, valid_idx in kf.split(X, y):
        # Fresh classifier per fold so folds cannot leak fitted state.
        clf = xgb.XGBClassifier(**params)
        clf.fit(X[train_idx], y[train_idx])
        pred = clf.predict(X[valid_idx])
        accuracies.append(accuracy_score(y[valid_idx], pred))
    print(f'Trial done: Accuracy values on folds: {accuracies}')
    return np.average(accuracies)
# Run the hyper-parameter search and report the winning trial.
n_trials = 10
FIT_XGB = True
if FIT_XGB:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)
    print(f"Number of finished trials: {len(study.trials)}")
    print("Best trial:")
    trial = study.best_trial
    print(f"  Value: {trial.value}")
    print("  Params: ")
    for name, setting in trial.params.items():
        print(f"    {name}: {setting}")
[I 2021-01-18 12:54:34,245] A new study created in memory with name: no-name-cdb43c2d-0279-42c0-bde9-9370deb33cde [I 2021-01-18 12:54:39,777] Trial 0 finished with value: 0.7902621722846442 and parameters: {'n_estimators': 971, 'max_depth': 13, 'learning_rate': 0.02804436130054331, 'subsample': 0.7100857890668766, 'colsample_bytree': 0.9282946530454147, 'gamma': 0}. Best is trial 0 with value: 0.7902621722846442.
Trial done: Accuracy values on folds: [0.7696629213483146, 0.7808988764044944, 0.8202247191011236]
[I 2021-01-18 12:54:44,838] Trial 1 finished with value: 0.7808988764044944 and parameters: {'n_estimators': 780, 'max_depth': 6, 'learning_rate': 0.04617851211266668, 'subsample': 0.8499144110388942, 'colsample_bytree': 0.8161804797537293, 'gamma': 0}. Best is trial 0 with value: 0.7902621722846442.
Trial done: Accuracy values on folds: [0.7528089887640449, 0.7696629213483146, 0.8202247191011236]
[I 2021-01-18 12:54:49,086] Trial 2 finished with value: 0.7940074906367042 and parameters: {'n_estimators': 630, 'max_depth': 12, 'learning_rate': 0.03605900595856281, 'subsample': 0.8552688103457784, 'colsample_bytree': 0.959344091561088, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7808988764044944, 0.7752808988764045, 0.8258426966292135]
[I 2021-01-18 12:54:53,406] Trial 3 finished with value: 0.7827715355805243 and parameters: {'n_estimators': 958, 'max_depth': 13, 'learning_rate': 0.0556411167157031, 'subsample': 0.6490181861933666, 'colsample_bytree': 0.9508217184763408, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7696629213483146, 0.7640449438202247, 0.8146067415730337]
[I 2021-01-18 12:54:54,691] Trial 4 finished with value: 0.7921348314606741 and parameters: {'n_estimators': 391, 'max_depth': 12, 'learning_rate': 0.05850370640475303, 'subsample': 0.6411637577560136, 'colsample_bytree': 0.8328898129294011, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7640449438202247, 0.7921348314606742, 0.8202247191011236]
[I 2021-01-18 12:54:56,702] Trial 5 finished with value: 0.7883895131086143 and parameters: {'n_estimators': 723, 'max_depth': 9, 'learning_rate': 0.08571131568948556, 'subsample': 0.9700382279538224, 'colsample_bytree': 0.7469601416089089, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7696629213483146, 0.7752808988764045, 0.8202247191011236]
[I 2021-01-18 12:54:58,290] Trial 6 finished with value: 0.7883895131086143 and parameters: {'n_estimators': 549, 'max_depth': 6, 'learning_rate': 0.07014171716719257, 'subsample': 0.6792047088518043, 'colsample_bytree': 0.8349760755492892, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7808988764044944, 0.7640449438202247, 0.8202247191011236]
[I 2021-01-18 12:55:01,400] Trial 7 finished with value: 0.7865168539325843 and parameters: {'n_estimators': 599, 'max_depth': 10, 'learning_rate': 0.059923104176251296, 'subsample': 0.700148112703793, 'colsample_bytree': 0.6363612598018495, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7584269662921348, 0.7808988764044944, 0.8202247191011236]
[I 2021-01-18 12:55:04,721] Trial 8 finished with value: 0.7808988764044944 and parameters: {'n_estimators': 612, 'max_depth': 13, 'learning_rate': 0.04360085562020063, 'subsample': 0.8957405544143879, 'colsample_bytree': 0.5065442877078434, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7528089887640449, 0.7696629213483146, 0.8202247191011236]
[I 2021-01-18 12:55:06,894] Trial 9 finished with value: 0.7827715355805243 and parameters: {'n_estimators': 912, 'max_depth': 11, 'learning_rate': 0.09138974927861207, 'subsample': 0.7296031811437478, 'colsample_bytree': 0.7480620566932148, 'gamma': 0}. Best is trial 2 with value: 0.7940074906367042.
Trial done: Accuracy values on folds: [0.7696629213483146, 0.7640449438202247, 0.8146067415730337]
Number of finished trials: 10
Best trial:
Value: 0.7940074906367042
Params:
n_estimators: 630
max_depth: 12
learning_rate: 0.03605900595856281
subsample: 0.8552688103457784
colsample_bytree: 0.959344091561088
gamma: 0
# Take the winning hyper-parameters and re-add the fixed (non-searched) ones.
best_param = study.best_params
best_param.update({'scale_pos_weight': 2.28, 'missing': -999})
import plotly
optuna.visualization.plot_slice(study)
optuna.visualization.plot_optimization_history(study)
# Fit the final model on the full training set.
model = xgb.XGBClassifier(**best_param)
model.fit(X_train, y_train)
[12:55:55] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.3.0/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=0.959344091561088, gamma=0,
gpu_id=-1, importance_type='gain', interaction_constraints='',
learning_rate=0.03605900595856281, max_delta_step=0, max_depth=12,
min_child_weight=1, missing=-999, monotone_constraints='()',
n_estimators=630, n_jobs=8, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=2.28,
subsample=0.8552688103457784, tree_method='exact',
validate_parameters=1, verbosity=None)
# Predict on the test set (identifier column excluded) and write the submission.
test_features = test.drop(columns='Loan_ID')
predictions_final = model.predict(test_features)
submission = pd.DataFrame({
    "Loan_ID": test["Loan_ID"],
    "Loan_Status": predictions_final,
})
submission.to_csv('submission.csv', index=False)
predictions_final
array(['Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N',
'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
'Y', 'N', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y',
'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y',
'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N',
'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'N',
'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y',
'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N',
'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y',
'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
'N', 'Y', 'N'], dtype=object)